Joe Margolis Quinn Hodgman Max Menache Devinn Chi Graham Elliot
Topic: Baseball Research question 1: How have batting average trends compared over time to OPS+ (might change OPS+)? Playoff vs. regular season Team averages stats, comparing playoff and non-playoff teams Single player averages, comparing all stars and non-all stars
Research question 2: Finding trends in each of the typed of hits over time, comparing singles, doubles, triples, homers.
Research question 3: How do players usually perform after a mid-season trade?
Research question 4: Which type of hit (single, double, triple, HR) best correlates with the value of the player?
library(Lahman)
data(Batting)
data(Teams)
data(BattingPost)
library(tufte)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(tint)
##
## Attaching package: 'tint'
## The following objects are masked from 'package:tufte':
##
## margin_note, newthought, quote_footer, sans_serif
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(patchwork)
options(htmltools.dir.version = FALSE)
baseballReference = read_html("https://www.baseball-reference.com/leagues/majors/bat.shtml#teams_standard_batting")
year = baseballReference %>%
html_nodes("#teams_standard_batting a") %>%
html_text() %>%
strtoi()
ave = baseballReference %>%
html_nodes("#teams_standard_batting .right:nth-child(19)") %>%
html_text()
ops = baseballReference %>%
html_nodes("#teams_standard_batting .right:nth-child(22)") %>%
html_text()
RefhittingStats = tibble(Year = year, Average = as.numeric(ave), OPS = as.numeric(ops))
RefhittingStats = RefhittingStats %>%
filter(Year >= 1955) %>%
pivot_longer(c(Average, OPS), names_to = "category", values_to = "value")
ggplot(RefhittingStats, aes(x = Year, y = value, color = category)) +
geom_point() +
labs(title = "MLB Hitting Stats by Year") +
geom_smooth() +
facet_wrap(~category, scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
hittingStats <- Batting %>%
mutate(BA = H/AB) %>%
filter(yearID >=1955)
tradedOnly <- hittingStats %>%
group_by(yearID, playerID) %>%
mutate(num_entry = n()) %>% # add variable that equals the number of entries
ungroup() %>%
filter(num_entry > 1) %>%
na.omit()# keep rows that have 1 entry per year/name or if they have more than 1, make sure the team = 'TOT'
ggplot(tradedOnly, aes(y = BA, x = yearID, color = as.factor(stint))) +
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
tradedOnlyYearBA <- tradedOnly %>%
group_by(yearID, stint) %>%
summarise(yAB = sum(AB), yH = sum(H)) %>%
mutate(yBA = yH/yAB)
## `summarise()` has grouped output by 'yearID'. You can override using the `.groups` argument.
ggplot(tradedOnlyYearBA, aes(y = yBA, x = yearID, color = as.factor(stint))) +
geom_point()+
geom_smooth()+
labs(title = "Batting Averages before and after players were traded in each year", y="Combined Batting Averages", x = "Year", caption = "Figure 1: Combined batting averages for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
This shows that there is an increase in batting averages after a player is traded, but the increase is slowly closing
Postseason <- BattingPost %>%
filter(yearID > 1955) %>%
group_by(playerID, yearID, teamID) %>%
summarise(PG = sum(G), PAB = sum(AB), PR = sum(R), PH = sum(H), `P2B` = sum(`X2B`), `P3B` = sum(`X3B`), PHR = sum(HR), PRBI = sum(RBI), PSB = sum(SB), PCS = sum(CS), PBB = sum(BB), PSO = sum(SO), PIBB = sum(IBB), PHBP = sum(HBP), PSH = sum(SH), PSF = sum(SF), PGIDP = sum(GIDP)) %>%
na.omit()
## `summarise()` has grouped output by 'playerID', 'yearID'. You can override using the `.groups` argument.
ShowTrade <- hittingStats %>%
group_by(yearID, playerID) %>%
mutate(num_entry = n()) %>% # add variable that equals the number of entries
ungroup() %>%
na.omit()
PostShowTradeYear <- Postseason %>%
left_join(ShowTrade, by = c("playerID"="playerID", "yearID" = "yearID", "teamID" = "teamID"))%>%
group_by(yearID, num_entry) %>%
summarise(YPG = sum(PG), YPAB = sum(PAB), YPR = sum(PR), YPH = sum(PH), `YP2B` = sum(`P2B`), `YP3B` = sum(`P3B`), YPHR = sum(PHR), YPRBI = sum(PRBI), YPSB = sum(PSB), YPCS = sum(PCS), YPBB = sum(PBB), YPSO = sum(PSO), YPIBB = sum(PIBB), YPHBP = sum(PHBP), YPSH = sum(PSH), YPSF = sum(PSF), YPGIDP = sum(PGIDP)) %>%
mutate(YPBA = YPH/YPAB) %>%
na.omit()
## `summarise()` has grouped output by 'yearID'. You can override using the `.groups` argument.
ggplot(PostShowTradeYear, aes(y = YPBA, x = yearID, color = as.factor(num_entry))) +
geom_point()+
geom_smooth()+
labs(title = "Playoff Batting Averages of players traded vs. Not traded", y="Combined Playoff Batting Averages", x = "Year", caption = "Figure 1: Combined batting averages for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
tradedOnlyYearOPS <- tradedOnly %>%
mutate(PA = AB + BB + HBP + SF + SH) %>%
mutate(TOB = H + BB + HBP) %>%
mutate(Singles = H - `X2B` - `X3B` - `HR`) %>%
mutate(TB = Singles + (2*`X2B`) + (3*`X3B`)+ (4*HR)) %>%
group_by(yearID, stint) %>%
summarise(yPA = sum(PA), yTOB = sum(TOB), yTB = sum(TB), yAB = sum(AB)) %>%
mutate(yOBP = yTOB/yPA) %>%
mutate(ySLG = yTB/yAB) %>%
mutate(yOPS = yOBP + ySLG)
## `summarise()` has grouped output by 'yearID'. You can override using the `.groups` argument.
PostShowTradeYearOPS <- PostShowTradeYear %>%
mutate(YPPA = YPAB + YPBB + YPHBP + YPSF + YPSH) %>%
mutate(YPTOB = YPH + YPBB + YPHBP) %>%
mutate(YPSingles = YPH - `YP2B` - `YP3B` - `YPHR`) %>%
mutate(YPTB = YPSingles + (2*`YP2B`) + (3*`YP3B`)+ (4*YPHR)) %>%
mutate(YPOBP = YPTOB/YPPA) %>%
mutate(YPSLG = YPTB/YPAB) %>%
mutate(YPOPS = YPOBP + YPSLG)
ggplot(tradedOnlyYearOPS, aes(y = yOBP, x = yearID, color = as.factor(stint))) +
geom_point()+
geom_smooth()+
labs(title = "On Base Percentages before and after players were traded in each year", y="Combined On Base Percentages", x = "Year", caption = "Figure 1: Combined On Baseb Percentages for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(tradedOnlyYearOPS, aes(y = ySLG, x = yearID, color = as.factor(stint))) +
geom_point()+
geom_smooth()+
labs(title = "Slugging Percentages before and after players were traded in each year", y="Combined Slugging Percentages", x = "Year", caption = "Figure 1: Combined Slugging Percentages for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(tradedOnlyYearOPS, aes(y = yOPS, x = yearID, color = as.factor(stint))) +
geom_point()+
geom_smooth()+
labs(title = "OPS before and after players were traded in each year", y="Combined OPS", x = "Year", caption = "Figure 1: Combined OPS for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The following graphs show the same trends in these other stats as batting average did.
ggplot(PostShowTradeYearOPS, aes(y = YPOBP, x = yearID, color = as.factor(num_entry))) +
geom_point()+
geom_smooth()+
labs(title = "Playoff On Base Percentage of players traded vs. Not traded", y="Combined Playoff On Base Percentage", x = "Year", caption = "Figure 1: Combined on base percentage for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(PostShowTradeYearOPS, aes(y = YPSLG, x = yearID, color = as.factor(num_entry))) +
geom_point()+
geom_smooth()+
labs(title = "Playoff Slugging Percentage of players traded vs. Not traded", y="Combined Playoff Slugging Percentage", x = "Year", caption = "Figure 1: Combined slugging percentage for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(PostShowTradeYearOPS, aes(y = YPOPS, x = yearID, color = as.factor(num_entry))) +
geom_point()+
geom_smooth()+
labs(title = "Playoff On Base Plus Slugging Percentage of players traded vs. Not traded", y="Combined Playoff On Base Plus Slugging Percentage", x = "Year", caption = "Figure 1: Combined on base plus slugging percentage for people before they were traded and after thay were traded each year.", color = "Stint")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
HitStatsYear <- Batting %>%
mutate(BA = H/AB) %>%
filter(yearID >=1955) %>%
filter(yearID <= 2019) %>%
filter(AB >= 100) %>%
mutate(singles = H - `X2B` - `X3B` - HR) %>%
group_by(yearID) %>%
summarise(singles = sum(singles), doubles = sum(`X2B`), triples = sum(`X3B`), HR = sum(HR)) %>%
pivot_longer(c(singles, doubles, triples, HR), names_to = "HitType", values_to = "Total", names_prefix = "Total_")
R1 <- ggplot(HitStatsYear, aes(x=yearID, y= Total, color = HitType)) +
geom_line() +
geom_point() +
labs(x = "Year",
y = "",
title = "Number of Hits Per Type of Hit")
R1
R2 <- ggplot(HitStatsYear, aes(x=yearID, y= Total, fill = HitType)) +
geom_col(position = "fill")+
labs(x = "Year",
y = "",
title = "Variation of Hits Shown as a Percentage")
R2
Everything shows similar overall trends but in the last few years singles have decreased drastically towards the other types of hits. Other than that thought peaks and lows are similar across all types of hits.
HitStatsYearPost <- BattingPost %>%
mutate(BA = H/AB) %>%
filter(yearID >=1955) %>%
filter(yearID <= 2019) %>%
mutate(singles = H - `X2B` - `X3B` - HR) %>%
group_by(yearID) %>%
summarise(singles = sum(singles), doubles = sum(`X2B`), triples = sum(`X3B`), HR = sum(HR)) %>%
pivot_longer(c(singles, doubles, triples, HR), names_to = "HitType", values_to = "Total", names_prefix = "Total_")
P1 <- ggplot(HitStatsYearPost, aes(x=yearID, y= Total, color = HitType)) +
geom_line() +
geom_point()+
labs(x = "Year",
y = "",
title = "Number of Hits Per Type of Hit (Postseason)")
P1
P2 <- ggplot(HitStatsYearPost, aes(x=yearID, y= Total, fill = HitType)) +
geom_col(position = "fill") +
labs(x = "Year",
y = "",
title = "Variation of Hits Shown as a Percentage (Postseason")
P2
All <- (R1/R2) | (P1/P2)
All+plot_annotation(title = 'Comparing Hits Per Year as a Whole and in The Postseason')
Research question 4: Which type of hit (single, double, triple, HR) best correlates with the value of the player?
Fangraphs <- read_csv("Fangraphs.csv")
## Rows: 8606 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Team
## dbl (7): Season, WAR, 1B, 2B, 3B, HR, playerid
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(Fangraphs) <- c('Season','Name', 'Team', 'WAR', 'Single', 'Double', 'Triple', 'HR', 'playerid')
ggplot(Fangraphs, aes(x = Single, y = WAR)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(Fangraphs, aes(x = Double, y = WAR)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(Fangraphs, aes(x = Triple, y = WAR)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(Fangraphs, aes(x = HR, y = WAR)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
These graphs show how each type of hit correlates with WAR. WAR is a stat that is best predictive of a player’s value, showing how many Team Wins they are worth when compared to a replacement-level player. Based on these graphs, it shows that doubles and home runs are generally more predictive of a player’s value rather than triples and singles. In baseball today, teams prioritize players who hit a lot of home runs and doubles, so this backs that up, showing that, generally, the more doubles and home runs that a player hits, the more valuable the player is to his team.